'''
Created on Aug 27, 2011

@author: philip
'''

from tools.BeautifulSoup import BeautifulStoneSoup
from Page import Page
from TextBlock import TextBlock
from Word import Word 

def process_document(document):
    """Parse *document* and process every ``page`` element it contains.

    The markup is handed to BeautifulStoneSoup; each ``page`` element
    found is passed to process_page.

    Returns a list holding the process_page result for each page, in the
    order findAll reports them.
    """
    soup = BeautifulStoneSoup(document)
    return [process_page(page_node) for page_node in soup.findAll("page")]

def process_page(page):
    """Convert one parsed ``page`` element into a Page object.

    The element's ``number`` attribute is converted with int() and used
    to construct the Page. Every ``para`` element found under the page is
    run through process_text_block and the result is added to the Page
    via add_text_block.

    Returns the populated Page.
    """
    page_number = int(page["number"])
    result = Page(page_number)

    for para in page.findAll("para"):
        block = process_text_block(para)
        result.add_text_block(block)

    return result
        
# Attribute values that mean "not set" for the glyph's "unknown" flag.
# The empty string is included because a plain truth test also rejects it.
_FALSE_ATTRIBUTE_VALUES = ("", "false", "0", "no")


def _is_bullet(word):
    """Return True when the ``text`` element *word* looks like a bullet.

    A word is treated as a bullet when
      * its string has length 1, and
      * the first ``glyph`` element found under the word's parent has an
        ``unknown`` attribute whose value is not an explicit false
        ("", "false", "0" or "no", compared case-insensitively).
    """
    # NOTE(review): word.string may be None for a text element that does
    # not hold exactly one string child; len() would then raise TypeError.
    # Confirm whether such elements can occur in the input.
    if len(word.string) != 1:
        return False

    # find() returns None when the parent holds no glyph at all, where
    # findAll("glyph")[0] would raise IndexError.
    glyph = word.parent.find("glyph")
    if glyph is None:
        return False

    # Attribute values come back from the parser as strings, so a plain
    # truth test would also accept unknown="false".
    flag = glyph.get("unknown")
    if flag is None:
        return False
    return flag.strip().lower() not in _FALSE_ATTRIBUTE_VALUES


def process_text_block(text_block):
    """Build a TextBlock from the ``text`` elements under *text_block*.

    Each ``text`` element contributes one Word, in the order findAll
    reports them. Elements recognised as bullets are added as
    ``Word("", True)``; every other element is added as
    ``Word(word.string)``.

    Returns the populated TextBlock.
    """
    tb = TextBlock()
    for word in text_block.findAll("text"):
        if _is_bullet(word):
            tb.add_word(Word("", True))
        else:
            tb.add_word(Word(word.string))

    return tb

